Dual CRISPR Screen Analysis

Step 1: Construct Scaffold Trimming

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. Fill in the values for your analysis for each of the variables in the Input Parameters section
  4. Click Cell > Run All

Input Parameters


In [ ]:
# Number of processors handed to the parallel trimming step (see Scaffold Trim Execution)
g_num_processors = 3
# Directory holding the input fastq / fastq.gz files; sub-directories are searched as well
g_fastqs_dir = '~/dual_crispr/test_data/test_set_1'
# Directory passed to the trimmer as its output directory
# NOTE(review): this value is routed through ns_runs.check_or_set with g_fastqs_dir as second
# argument, presumably so g_fastqs_dir is used as a fallback when it is unset -- confirm
g_trimmed_fastqs_dir = '~/dual_crispr/test_outputs/test_set_1'
# Scaffold sequences given to the trimmer: the 5' and 3' sequences suffixed _r1 are used for the
# forward-read fastq and those suffixed _r2 for the reverse-read fastq
g_full_5p_r1 = 'TATATATCTTGTGGAAAGGACGAAACACCG'
g_full_5p_r2 = 'CCTTATTTTAACTTGCTATTTCTAGCTCTAAAAC'
g_full_3p_r1 = 'GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG'
g_full_3p_r2 = 'CAAACAAGGCTTTTCTCCAAGGGATATTTATAGTCTCAAAACACACAATTACTTTACAGTTAGGGTGAGTTTCCTTTTGTGCTGTTTTTTAAAATA'
# Whether the .gz files are kept alongside the expanded fastqs when gunzipping
g_keep_gzs = False  # True only works for gzip 1.6+ (apparently not available on AWS linux)

Automated Set-Up


In [ ]:
import inspect

import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a printable summary of the named variables.

    Each entry of input_var_name_list is evaluated with eval() and rendered as
    "<entry>: <value>" followed by a newline; the rendered lines are
    concatenated, in input order, into the single string that is returned.
    An empty input list yields an empty string.

    Because the entries are passed to eval(), they must only ever be trusted,
    notebook-author-supplied names -- never external input.
    """
    return "".join(
        "{0}: {1}\n".format(var_name, eval(var_name))
        for var_name in input_var_name_list
    )


# Configure logging for this notebook via ccbb_pyutils.notebook_logging
# NOTE(review): presumably routes INFO-level log messages to stdout so they appear in cell output -- confirm
ns_logs.set_stdout_info_logger()

In [ ]:
# Replace the user-supplied input path with its expanded form (expansion rules live in
# ns_files.expand_path); this must run before g_fastqs_dir is used as an argument on the next line
g_fastqs_dir = ns_files.expand_path(g_fastqs_dir)
# NOTE(review): check_or_set presumably returns g_trimmed_fastqs_dir when it is set and falls back
# to g_fastqs_dir otherwise -- confirm in ccbb_pyutils.analysis_run_prefixes
g_trimmed_fastqs_dir = ns_files.expand_path(ns_runs.check_or_set(g_trimmed_fastqs_dir, g_fastqs_dir))
# Echo the resolved directories so they are recorded in the notebook output
print(describe_var_list(['g_fastqs_dir','g_trimmed_fastqs_dir']))
# NOTE(review): presumably creates the output directory if it does not already exist -- confirm
ns_files.verify_or_make_dir(g_trimmed_fastqs_dir)

Scaffold Trimming Functions


In [ ]:
# Import the scaffold-trimming module and print its source text so the exact trimming code
# used for this run is recorded in the notebook output
import dual_crispr.scaffold_trim as trim
print(inspect.getsource(trim))

In [ ]:
def trim_fw_and_rv_reads(output_dir, full_5p_r1, full_3p_r1, full_5p_r2, full_3p_r2, fw_fastq_fp, rv_fastq_fp):
    """Trim the scaffold from one forward/reverse pair of fastq files.

    Calls trim.trim_linked_scaffold twice, forward file first: once for
    fw_fastq_fp with the read-1 scaffold sequences (full_5p_r1, full_3p_r1)
    and once for rv_fastq_fp with the read-2 scaffold sequences
    (full_5p_r2, full_3p_r2). output_dir is passed through to both calls.

    Returns None; whatever trim_linked_scaffold returns is discarded.
    """
    per_read_inputs = (
        (fw_fastq_fp, full_5p_r1, full_3p_r1),
        (rv_fastq_fp, full_5p_r2, full_3p_r2),
    )
    for fastq_fp, scaffold_5p, scaffold_3p in per_read_inputs:
        trim.trim_linked_scaffold(output_dir, fastq_fp, scaffold_5p, scaffold_3p)

Gzipped FASTQ Filenames


In [ ]:
# Extension identifying the (uncompressed) sequence files to be trimmed
g_seq_file_ext_name = ".fastq"
# Extension appended to the sequence-file extension for gzip-compressed inputs (i.e. ".fastq.gz")
g_gzip_ext_name = ".gz"

In [ ]:
# Report the gzipped fastq files found under the input directory, including sub-directories.
# NOTE(review): just_warn=True presumably means that finding none is only a warning rather than an
# error (the inputs may already be uncompressed) -- confirm in ccbb_pyutils.files_and_paths
print(ns_files.check_file_presence(g_fastqs_dir, "",  "{0}{1}".format(g_seq_file_ext_name, g_gzip_ext_name), 
                                   all_subdirs=True, check_failure_msg=None, just_warn=True))

FASTQ Gunzip Execution


In [ ]:
import ccbb_pyutils.files_and_paths as ns_files

def unzip_and_flatten_seq_files(top_fastqs_dir, ext_name, gzip_ext_name, keep_gzs):
    """Gunzip every compressed sequence file under top_fastqs_dir, then flatten the tree.

    top_fastqs_dir: directory to work in.
    ext_name: sequence-file extension (e.g. ".fastq").
    gzip_ext_name: extension appended to ext_name on compressed files (e.g. ".gz").
    keep_gzs: passed through to ns_files.gunzip_wildpath to control whether the
        compressed originals are kept.

    Returns None.
    """
    compressed_ext = ext_name + gzip_ext_name
    search_recursively = True

    # Step 1: expand the compressed files wherever they sit beneath the top directory
    ns_files.gunzip_wildpath(top_fastqs_dir, compressed_ext, keep_gzs, search_recursively)

    # Step 2: gather the sequence files into the top directory itself, so that
    # later steps need not descend into sub-directories
    source_dir = destination_dir = top_fastqs_dir
    ns_files.move_to_dir_and_flatten(source_dir, destination_dir, ext_name)

In [ ]:
# Gunzip all fastq.gz files under the input directory and move the resulting fastqs to its top level.
# g_keep_gzs: False = do not keep the .gz files after expanding them, True = keep them as well
# (True only works for gzip 1.6+)
unzip_and_flatten_seq_files(g_fastqs_dir, g_seq_file_ext_name, g_gzip_ext_name, g_keep_gzs)

FASTQ Filenames


In [ ]:
# Report the uncompressed fastq files now present in the input directory; the message below is
# supplied to check_file_presence as its failure message for the case where none are found
# NOTE(review): whether a failed check raises or only reports is decided inside ccbb_pyutils -- confirm
print(ns_files.check_file_presence(g_fastqs_dir, "", g_seq_file_ext_name, 
                                   check_failure_msg="No fastq files to trim were detected."))

Scaffold Trim Execution


In [ ]:
import ccbb_pyutils.parallel_process_fastqs as ns_parallel
# Run trim_fw_and_rv_reads over the paired read files in g_fastqs_dir, using g_num_processors.
# The list holds the first five arguments of trim_fw_and_rv_reads, in signature order (output
# directory, then the r1 5'/3' and r2 5'/3' scaffold sequences).
# NOTE(review): presumably ns_parallel appends each pair's forward and reverse fastq paths as the
# remaining two arguments -- confirm in ccbb_pyutils.parallel_process_fastqs
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_fastqs_dir, g_seq_file_ext_name, g_num_processors, 
                                                   trim_fw_and_rv_reads, [g_trimmed_fastqs_dir, g_full_5p_r1, 
                                                                          g_full_3p_r1, g_full_5p_r2, g_full_3p_r2])

In [ ]:
# Print the results of the parallel trimming run, combined by ns_parallel into one printable value
# NOTE(review): presumably this is the collected per-pair output/log text -- confirm
print(ns_parallel.concatenate_parallel_results(g_parallel_results))

Trimmed FASTQ Filenames


In [ ]:
# Report the files in the output directory that carry the suffix the trim module gives for
# TrimType.FIVE_THREE; the message below is supplied to check_file_presence as its failure
# message for the case where no such files are found
print(ns_files.check_file_presence(g_trimmed_fastqs_dir, "", trim.get_trimmed_suffix(trim.TrimType.FIVE_THREE),
                                  check_failure_msg="Scaffold trimming failed to produce trimmed file(s)."))